In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import warnings
import enclus
warnings.filterwarnings('ignore')

sc.set_figure_params(dpi=300,fontsize=10,dpi_save=300,facecolor='white',format='pdf')
sc.settings.figdir = './Result/hypothalamic/'

uniPort¶

In [24]:
import uniport as up
print(up.__version__)

st_data = sc.read('./datasets/hypothalamic/MERFISH_hypothalamic_st_data.h5ad')
sc_data = sc.read('./datasets/hypothalamic/MERFISH_hypothalamic_sc_data.h5ad')

sc_data.obs_names_make_unique()
st_data.obs_names_make_unique()

sc_data.var_names_make_unique()
sc_data.var_names_make_unique()

st_data.obs['domain_id'] = 0
st_data.obs['domain_id'] = st_data.obs['domain_id'].astype('category')
st_data.obs['source'] = 'MERFISH'

sc_data.obs['domain_id'] = 1
sc_data.obs['domain_id'] = sc_data.obs['domain_id'].astype('category')
sc_data.obs['source'] = 'RNA'

sc.pp.normalize_total(st_data)
sc.pp.log1p(st_data)
sc.pp.highly_variable_genes(st_data, n_top_genes=2000, inplace=True, subset=True)
up.batch_scale(st_data)

sc.pp.normalize_total(sc_data)
sc.pp.log1p(sc_data)
sc.pp.highly_variable_genes(sc_data, n_top_genes=2000, inplace=True, subset=True)
up.batch_scale(sc_data)

print(sc_data,st_data)
# st_data.write_h5ad('./datasets/ProcessedMERFISH_hypothalamic_st_data.h5ad')
# sc_data.write_h5ad('./datasets/ProcessedMERFISH_hypothalamic_sc_data.h5ad')
1.2.2
AnnData object with n_obs × n_vars = 30370 × 2000
    obs: 'cell_type', 'domain_id', 'source'
    var: 'Gene', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg' AnnData object with n_obs × n_vars = 64373 × 155
    obs: 'cell_type', 'domain_id', 'source'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'
    obsm: 'spatial'
In [6]:
adata_merfish = sc.read('./datasets/ProcessedMERFISH_hypothalamic_st_data.h5ad')
adata_rna = sc.read('./datasets/ProcessedMERFISH_hypothalamic_sc_data.h5ad')
adata_cm = sc.read('./datasets/MERFISH_hypothalamic_cm_data.h5ad')
In [ ]:
adata = up.Run(adatas=[adata_merfish, adata_rna], adata_cm=adata_cm, lambda_kl=5.0)
Device: cpu
Dataset 0: MERFISH
AnnData object with n_obs × n_vars = 64373 × 155
    obs: 'cell_type', 'domain_id', 'source'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'
    obsm: 'spatial'
Dataset 1: RNA
AnnData object with n_obs × n_vars = 30370 × 2000
    obs: 'cell_type', 'domain_id', 'source'
    var: 'Gene', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'
Reference dataset is dataset 1


Data with common HVG
AnnData object with n_obs × n_vars = 94743 × 154
    obs: 'cell_type', 'domain_id', 'source'
    var: 'Gene-1'
    uns: 'log1p'


Epochs: 100%|██████████| 82/82 [45:23<00:00, 33.22s/it, recloss=791.54,klloss=29.45,otloss=4.24] 
In [11]:
sc.pp.neighbors(adata, use_rep='latent')
sc.tl.umap(adata, min_dist=0.1)
sc.pl.umap(adata, color=['source', 'cell_type'],save='uniport-integrate.pdf',wspace=0.3,legend_fontsize=10)
WARNING: saving figure to file Result\hypothalamic\umapuniport-integrate.pdf
No description has been provided for this image
In [ ]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, f1_score
import enclus.metrics
adata1 = adata[adata.obs['domain_id']=='0']
adata2 = adata[adata.obs['domain_id']=='1']
y_test = enclus.metrics.label_transfer(adata2, adata1, label='cell_type', rep='X_umap')

f1 = f1_score(adata1.obs['cell_type'], y_test, average='weighted')
ARI = adjusted_rand_score(adata1.obs['cell_type'], y_test)
NMI = normalized_mutual_info_score(adata1.obs['cell_type'], y_test)
print('F1:', f1)
print('ARI:', ARI)
print('NMI:', NMI)
print('total score:',f1+ARI+NMI)
print('Batch Entropy:', enclus.metrics.batch_entropy_mixing_score(adata.obsm['X_umap'], adata.obs['domain_id']))
print('Silhouette:', enclus.metrics.silhouette(adata.obsm['X_umap'], adata.obs['cell_type']))
F1: 0.871461070953648
ARI: 0.7429336420387456
NMI: 0.732073753207167
total score: 2.3464684661995605
Batch Entropy: 0.47998895178474116
Silhouette: 0.693193793296814

MultiMAP¶

In [3]:
st_data = sc.read('./datasets/hypothalamic/MERFISH_hypothalamic_st_data.h5ad')
sc_data = sc.read('./datasets/hypothalamic/MERFISH_hypothalamic_sc_data.h5ad')

sc_data.obs_names_make_unique()
st_data.obs_names_make_unique()

sc_data.var_names_make_unique()
sc_data.var_names_make_unique()

st_data.obs['domain_id'] = 0
st_data.obs['domain_id'] = st_data.obs['domain_id'].astype('category')
st_data.obs['source'] = 'MERFISH'

sc_data.obs['domain_id'] = 1
sc_data.obs['domain_id'] = sc_data.obs['domain_id'].astype('category')
sc_data.obs['source'] = 'RNA'

sc.pp.normalize_total(st_data)
sc.pp.log1p(st_data)

sc.pp.normalize_total(sc_data)
sc.pp.log1p(sc_data)

sc.pp.scale(st_data)
sc.pp.pca(st_data)

sc.pp.scale(sc_data)
sc.pp.pca(sc_data)
In [4]:
sc_data,st_data
Out[4]:
(AnnData object with n_obs × n_vars = 30370 × 27998
     obs: 'cell_type', 'domain_id', 'source'
     var: 'Gene', 'mean', 'std'
     uns: 'log1p', 'pca'
     obsm: 'X_pca'
     varm: 'PCs',
 AnnData object with n_obs × n_vars = 64373 × 155
     obs: 'cell_type', 'domain_id', 'source'
     var: 'mean', 'std'
     uns: 'log1p', 'pca'
     obsm: 'spatial', 'X_pca'
     varm: 'PCs')
In [5]:
from integration.MultiMAP.model import *
adata = Integration([st_data, sc_data], ['X_pca', 'X_pca'])
In [ ]:
sc.pl.embedding(adata, 'X_multimap', color=['source','cell_type'],save='MultiMAP-integrate.pdf',wspace=0.3,legend_fontsize=10)
WARNING: saving figure to file Result\hypothalamic\X_multimapMultiMAP-integrate.pdf
d:\Program Files\Anaconda3\envs\integrate\lib\site-packages\scanpy\plotting\_tools\scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
d:\Program Files\Anaconda3\envs\integrate\lib\site-packages\scanpy\plotting\_tools\scatterplots.py:394: UserWarning: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored
  cax = scatter(
No description has been provided for this image
In [ ]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, f1_score
import enclus.metrics
adata1 = adata[adata.obs['domain_id']==0]
adata2 = adata[adata.obs['domain_id']==1]
y_test = enclus.metrics.label_transfer(adata2, adata1, label='cell_type', rep='X_multimap')

f1 = f1_score(adata1.obs['cell_type'], y_test, average='weighted')
ARI = adjusted_rand_score(adata1.obs['cell_type'], y_test)
NMI = normalized_mutual_info_score(adata1.obs['cell_type'], y_test)
print('F1:', f1)
print('ARI:', ARI)
print('NMI:', NMI)
print('total score:',f1+ARI+NMI)
print('Batch Entropy:', enclus.metrics.batch_entropy_mixing_score(adata.obsm['X_multimap'], adata.obs['domain_id']))
print('Silhouette:', enclus.metrics.silhouette(adata.obsm['X_multimap'], adata.obs['cell_type']))
F1: 0.8808136163876433
ARI: 0.7628832260492341
NMI: 0.7679373319525135
total score: 2.4116341743893908
Batch Entropy: 0.514425433948172
Silhouette: 0.6702606976032257

gimVI¶

In [2]:
st_data = sc.read('./datasets/hypothalamic/MERFISH_hypothalamic_st_data.h5ad')
sc_data = sc.read('./datasets/hypothalamic/MERFISH_hypothalamic_sc_data.h5ad')

sc_data.obs_names_make_unique()
st_data.obs_names_make_unique()

sc_data.var_names_make_unique()
sc_data.var_names_make_unique()

st_data.obs['domain_id'] = 0
st_data.obs['domain_id'] = st_data.obs['domain_id'].astype('category')
st_data.obs['source'] = 'MERFISH'

sc_data.obs['domain_id'] = 1
sc_data.obs['domain_id'] = sc_data.obs['domain_id'].astype('category')
sc_data.obs['source'] = 'RNA'

sc.pp.normalize_total(st_data)
sc.pp.log1p(st_data)

sc.pp.normalize_total(sc_data)
sc.pp.log1p(sc_data)
In [3]:
from baseline.scvi.model import GIMVI
import baseline.scvi as scvi

train_size = 0.8
# only use genes in both datasets
common_gene = np.intersect1d(st_data.var_names,sc_data.var_names)
st_data = st_data[:,common_gene]
seq_data = sc_data[:, common_gene]

seq_gene_names = seq_data.var_names
n_genes = seq_data.n_vars
n_train_genes = int(n_genes * train_size)

# randomly select training_genes
rand_train_gene_idx = np.random.choice(range(n_genes), n_train_genes, replace=False)
rand_test_gene_idx = sorted(set(range(n_genes)) - set(rand_train_gene_idx))
rand_train_genes = seq_gene_names[rand_train_gene_idx]
rand_test_genes = seq_gene_names[rand_test_gene_idx]

# spatial_data_partial has a subset of the genes to train on
spatial_data_partial = st_data[:, rand_train_genes].copy()

# remove cells with no counts
sc.pp.filter_cells(spatial_data_partial, min_counts=1)
sc.pp.filter_cells(seq_data, min_counts=1)

# setup_anndata for spatial and sequencing data
scvi.data.setup_anndata(spatial_data_partial)
scvi.data.setup_anndata(seq_data)

# spatial_data should use the same cells as our training data
# cells may have been removed by scanpy.pp.filter_cells()
spatial_data = st_data[spatial_data_partial.obs_names]
INFO     'scvi' logger already has a StreamHandler, set its level to 20.                     
In [4]:
model = GIMVI(seq_data, spatial_data_partial)
model.train(max_epochs=200)
Training...: 100%|██████████| 200/200 [31:44<00:00,  9.52s/it]
In [5]:
seq_data,st_data,spatial_data_partial
Out[5]:
(AnnData object with n_obs × n_vars = 30367 × 154
     obs: 'cell_type', 'domain_id', 'source', 'n_counts', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var'
     var: 'Gene'
     uns: 'log1p', '_scvi',
 View of AnnData object with n_obs × n_vars = 64373 × 154
     obs: 'cell_type', 'domain_id', 'source'
     uns: 'log1p'
     obsm: 'spatial',
 AnnData object with n_obs × n_vars = 64372 × 123
     obs: 'cell_type', 'domain_id', 'source', 'n_counts', '_scvi_batch', '_scvi_labels', '_scvi_local_l_mean', '_scvi_local_l_var'
     uns: 'log1p', '_scvi'
     obsm: 'spatial')
In [8]:
import anndata
# get the latent representations for the sequencing and spatial data
latent_seq, latent_spatial = model.get_latent_representation()

# concatenate to one latent representation
latent_representation = np.concatenate([latent_spatial,latent_seq])
latent_adata = anndata.AnnData(latent_representation)

# labels which cells were from the sequencing dataset and which were from the spatial dataset
latent_labels =(["MERFISH"] * latent_spatial.shape[0]) + (["RNA"] * latent_seq.shape[0])
latent_adata.obs["labels"] = latent_labels
latent_adata.obs["cell_type"] = spatial_data_partial.obs['cell_type'].to_list()+seq_data.obs['cell_type'].to_list()
latent_adata.obs["domain_id"] =(["0"] * latent_spatial.shape[0]) + (["1"] * latent_seq.shape[0])

# compute umap
sc.pp.neighbors(latent_adata, use_rep="X")
sc.tl.umap(latent_adata)

# save umap representations to original seq and spatial_datasets
seq_data.obsm["X_umap"] = latent_adata.obsm["X_umap"][: seq_data.shape[0]]
spatial_data.obsm["X_umap"] = latent_adata.obsm["X_umap"][seq_data.shape[0] :]
In [3]:
# umap of the combined latent space
sc.pl.umap(latent_adata, color=['labels', 'cell_type'],save='gimVI-integrate.pdf',wspace=0.3,legend_fontsize=10)
WARNING: saving figure to file Result\hypothalamic\umapgimVI-integrate.pdf
No description has been provided for this image
In [10]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, f1_score
import enclus.metrics
adata1 = latent_adata[latent_adata.obs['domain_id']=='0']
adata2 = latent_adata[latent_adata.obs['domain_id']=='1']
y_test = enclus.metrics.label_transfer(adata2, adata1, label='cell_type', rep='X_umap')

f1 = f1_score(adata1.obs['cell_type'], y_test, average='weighted')
ARI = adjusted_rand_score(adata1.obs['cell_type'], y_test)
NMI = normalized_mutual_info_score(adata1.obs['cell_type'], y_test)
print('F1:', f1)
print('ARI:', ARI)
print('NMI:', NMI)
print('total score:',f1+ARI+NMI)
print('Batch Entropy:', enclus.metrics.batch_entropy_mixing_score(latent_adata.obsm['X_umap'], latent_adata.obs['domain_id']))
print('Silhouette:', enclus.metrics.silhouette(latent_adata.obsm['X_umap'], latent_adata.obs['cell_type']))
F1: 0.6458704940597764
ARI: 0.625984661312957
NMI: 0.6066587759777841
total score: 1.8785139313505175
Batch Entropy: 0.5944166602436534
Silhouette: 0.5858384892344475

SCALEX¶

In [2]:
import scalex
from scalex.function import SCALEX
from scalex.plot import embedding
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
In [17]:
adata_cm = sc.read('./datasets/MERFISH_hypothalamic_cm_data.h5ad')
adata_cm.obs['batch'] = adata_cm.obs['source']
adata_cm.write_h5ad('./datasets/scalex_hypo_adata.h5ad')
wk_dir = './datasets/'
In [23]:
adata = SCALEX(data_list = [wk_dir+'scalex_hypo_adata.h5ad'],
              min_features=0, 
              min_cells=0, 
              outdir='./Result/hypothalamic/',
              show=False,
              gpu=0)
2025-03-12 23:31:13,175 - SCALEX - INFO - Raw dataset shape: (94743, 154)
2025-03-12 23:31:13,175 - SCALEX - INFO - Raw dataset shape: (94743, 154)
2025-03-12 23:31:13,177 - SCALEX - INFO - There are 2 batches under batch_name: batch
2025-03-12 23:31:13,177 - SCALEX - INFO - There are 2 batches under batch_name: batch
2025-03-12 23:31:13,178 - SCALEX - INFO - Preprocessing
2025-03-12 23:31:13,178 - SCALEX - INFO - Preprocessing
2025-03-12 23:31:13,186 - SCALEX - INFO - Filtering cells
2025-03-12 23:31:13,186 - SCALEX - INFO - Filtering cells
2025-03-12 23:31:13,365 - SCALEX - INFO - Filtering features
2025-03-12 23:31:13,365 - SCALEX - INFO - Filtering features
2025-03-12 23:31:13,497 - SCALEX - INFO - Normalizing total per cell
2025-03-12 23:31:13,497 - SCALEX - INFO - Normalizing total per cell
2025-03-12 23:31:13,526 - SCALEX - INFO - Log1p transforming
2025-03-12 23:31:13,526 - SCALEX - INFO - Log1p transforming
WARNING: adata.X seems to be already log-transformed.
2025-03-12 23:31:13,552 - SCALEX - INFO - Finding variable features
2025-03-12 23:31:13,552 - SCALEX - INFO - Finding variable features
2025-03-12 23:31:13,789 - SCALEX - INFO - Batch specific maxabs scaling
2025-03-12 23:31:13,789 - SCALEX - INFO - Batch specific maxabs scaling
2025-03-12 23:31:14,986 - SCALEX - INFO - Processed dataset shape: (94743, 154)
2025-03-12 23:31:14,986 - SCALEX - INFO - Processed dataset shape: (94743, 154)
Epochs: 100%|██████████| 21/21 [06:54<00:00, 19.73s/it, recon_loss=58.260,kl_loss=2.497]
2025-03-12 23:38:20,754 - SCALEX - INFO - Plot umap
2025-03-12 23:38:20,754 - SCALEX - INFO - Plot umap
WARNING: saving figure to file Result\hypothalamic\umap.png
In [26]:
sc.set_figure_params(dpi=300,fontsize=10,dpi_save=300,facecolor='white',format='pdf')
sc.pl.umap(adata, color=['batch', 'cell_type'],save='SCALEX-integrate.pdf',wspace=0.3,legend_fontsize=10)
WARNING: saving figure to file Result\hypothalamic\umapSCALEX-integrate.pdf
No description has been provided for this image
In [30]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, f1_score
import enclus.metrics
adata1 = adata[adata.obs['domain_id']=='0']
adata2 = adata[adata.obs['domain_id']=='1']
y_test = enclus.metrics.label_transfer(adata2, adata1, label='cell_type', rep='X_scalex_umap')

f1 = f1_score(adata1.obs['cell_type'], y_test, average='weighted')
ARI = adjusted_rand_score(adata1.obs['cell_type'], y_test)
NMI = normalized_mutual_info_score(adata1.obs['cell_type'], y_test)
print('F1:', f1)
print('ARI:', ARI)
print('NMI:', NMI)
print('total score:',f1+ARI+NMI)
print('Batch Entropy:', enclus.metrics.batch_entropy_mixing_score(adata.obsm['X_scalex_umap'], adata.obs['domain_id']))
print('Silhouette:', enclus.metrics.silhouette(adata.obsm['X_scalex_umap'], adata.obs['cell_type']))
F1: 0.7510572103225998
ARI: 0.5705643773958746
NMI: 0.5728128818126889
total score: 1.8944344695311632
Batch Entropy: 0.5145418383234401
Silhouette: 0.6278813630342484

scVI¶

In [11]:
import anndata
import scanpy as sc
import scvi
import warnings
warnings.filterwarnings('ignore')
sc.set_figure_params(dpi=300,fontsize=10,dpi_save=300,facecolor='white',format='pdf')
sc.settings.figdir = './Result/hypothalamic/'
In [12]:
st_data = sc.read('./datasets/hypothalamic/MERFISH_hypothalamic_st_data.h5ad')
sc_data = sc.read('./datasets/hypothalamic/MERFISH_hypothalamic_sc_data.h5ad')

sc_data.obs_names_make_unique()
st_data.obs_names_make_unique()

sc_data.var_names_make_unique()
sc_data.var_names_make_unique()

st_data.obs['domain_id'] = 0
st_data.obs['domain_id'] = st_data.obs['domain_id'].astype('category')
st_data.obs['source'] = 'MERFISH'

sc_data.obs['domain_id'] = 1
sc_data.obs['domain_id'] = sc_data.obs['domain_id'].astype('category')
sc_data.obs['source'] = 'RNA'

sc.pp.normalize_total(st_data)
sc.pp.log1p(st_data)

sc.pp.normalize_total(sc_data)
sc.pp.log1p(sc_data)
In [29]:
adata = anndata.concat([st_data,sc_data])
scvi.model.SCVI.setup_anndata(adata,  batch_key="source")
scvi_model = scvi.model.SCVI(adata, n_layers=2, n_latent=30)
scvi_model.train()
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Epoch 1/84:   0%|          | 0/84 [00:00<?, ?it/s]
Epoch 84/84: 100%|██████████| 84/84 [10:42<00:00,  7.90s/it, v_num=1, train_loss_step=67.9, train_loss_epoch=74.5]
`Trainer.fit` stopped: `max_epochs=84` reached.
Epoch 84/84: 100%|██████████| 84/84 [10:42<00:00,  7.65s/it, v_num=1, train_loss_step=67.9, train_loss_epoch=74.5]
In [30]:
SCVI_LATENT_KEY = "X_scVI"
adata.obsm[SCVI_LATENT_KEY] = scvi_model.get_latent_representation()
sc.pp.neighbors(adata, use_rep=SCVI_LATENT_KEY)
sc.tl.umap(adata, min_dist=0.1)
In [31]:
sc.pl.umap(adata, color=['source', 'cell_type'],save='scvi-integrate.pdf',wspace=0.3,legend_fontsize=10)
WARNING: saving figure to file Result\hypothalamic\umapscvi-integrate.pdf
No description has been provided for this image
In [37]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, f1_score
import enclus.metrics
adata1 = adata[adata.obs['domain_id']==0]
adata2 = adata[adata.obs['domain_id']==1]
y_test = enclus.metrics.label_transfer(adata2, adata1, label='cell_type', rep='X_umap')

f1 = f1_score(adata1.obs['cell_type'], y_test, average='weighted')
ARI = adjusted_rand_score(adata1.obs['cell_type'], y_test)
NMI = normalized_mutual_info_score(adata1.obs['cell_type'], y_test)
print('F1:', f1)
print('ARI:', ARI)
print('NMI:', NMI)
print('total score:',f1+ARI+NMI)
print('Batch Entropy:', enclus.metrics.batch_entropy_mixing_score(adata.obsm['X_umap'], adata.obs['domain_id']))
print('Silhouette:', enclus.metrics.silhouette(adata.obsm['X_umap'], adata.obs['cell_type']))
F1: 0.817277553437705
ARI: 0.6490055920337885
NMI: 0.6642281616875901
total score: 2.1305113071590838
Batch Entropy: 0.4367643613852369
Silhouette: 0.6226593926548958

Seurat¶

In [ ]:
# Export unstandardized data to R for processing
adata = sc.read('./datasets/MERFISH_hypothalamic_cm_data.h5ad')
adata.to_df().T.to_csv('expression_matrix.csv')
adata.obs['source'].to_csv('batch_info.csv')
adata.obs['cell_type'].to_csv('cell_type_info.csv')
In [6]:
import pandas as pd
import numpy as np
import anndata

# read embedding
embedding = pd.read_csv("./Result/hypothalamic/integrated_cca_embedding.csv", index_col=0)

# read metadata
metadata = pd.read_csv("./Result/hypothalamic/metadata.csv")
metadata.index = metadata["Cell"]  # Set the index as the cell name
metadata = metadata.drop(columns=["Cell"])  

# buid AnnData 
adata = anndata.AnnData(X=embedding)
adata.obs = metadata  # Add batch and cell type information

print(adata)
print(adata.obs.head())
AnnData object with n_obs × n_vars = 94743 × 50
    obs: 'source', 'cell_type'
                                          source   cell_type
Cell                                                        
X6749ccb4.2ed1.4029.968f.820a287f43c8.0  MERFISH  Astrocytes
X6cac74bd.4ea7.4701.8701.42563cc65eb8.0  MERFISH  Inhibitory
X9f29bd57.16a5.4b26.b9f5.37598809da9e.0  MERFISH  Inhibitory
d7eb4e0b.276e.47e3.a55c.0b033180a2fe.0   MERFISH  Inhibitory
X54434f3a.eba9.4aec.af35.c9d317ffa1d5.0  MERFISH  Inhibitory
In [4]:
def label_transfer(ref, query, label='celltype'):
    from sklearn.neighbors import KNeighborsClassifier
    
    X_train = ref.X
    y_train = ref.obs[label]
    X_test = query.X
    
    knn = knn = KNeighborsClassifier().fit(X_train, y_train)
    y_test = knn.predict(X_test)
    
    return y_test
In [8]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, f1_score
import enclus.metrics
adata1 = adata[adata.obs['source']=='MERFISH']
adata2 = adata[adata.obs['source']=='RNA']
y_test = label_transfer(adata2, adata1, label='cell_type')

f1 = f1_score(adata1.obs['cell_type'], y_test, average='weighted')
ARI = adjusted_rand_score(adata1.obs['cell_type'], y_test)
NMI = normalized_mutual_info_score(adata1.obs['cell_type'], y_test)
print('F1:', f1)
print('ARI:', ARI)
print('NMI:', NMI)
print('total score:',f1+ARI+NMI)
print('Batch Entropy:', enclus.metrics.batch_entropy_mixing_score(adata.X, adata.obs['source']))
print('Silhouette:', enclus.metrics.silhouette(adata.X, adata.obs['cell_type']))
F1: 0.8091003178716528
ARI: 0.6508844103152466
NMI: 0.6594581930237735
total score: 2.119442921210673
Batch Entropy: 0.29372671828102115
Silhouette: 0.5137207407478872

Harmony¶

In [9]:
import pandas as pd
import numpy as np
import anndata

embedding = pd.read_csv("./Result/hypothalamic/integrated_harmony_embedding.csv", index_col=0)

metadata = pd.read_csv("./Result/hypothalamic/metadata.csv")
metadata.index = metadata["Cell"]  
metadata = metadata.drop(columns=["Cell"])  

adata = anndata.AnnData(X=embedding)
adata.obs = metadata  

print(adata)
print(adata.obs.head())
AnnData object with n_obs × n_vars = 94743 × 50
    obs: 'source', 'cell_type'
                                          source   cell_type
Cell                                                        
X6749ccb4.2ed1.4029.968f.820a287f43c8.0  MERFISH  Astrocytes
X6cac74bd.4ea7.4701.8701.42563cc65eb8.0  MERFISH  Inhibitory
X9f29bd57.16a5.4b26.b9f5.37598809da9e.0  MERFISH  Inhibitory
d7eb4e0b.276e.47e3.a55c.0b033180a2fe.0   MERFISH  Inhibitory
X54434f3a.eba9.4aec.af35.c9d317ffa1d5.0  MERFISH  Inhibitory
In [10]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score, f1_score
import enclus.metrics
adata1 = adata[adata.obs['source']=='MERFISH']
adata2 = adata[adata.obs['source']=='RNA']
y_test = label_transfer(adata2, adata1, label='cell_type')

f1 = f1_score(adata1.obs['cell_type'], y_test, average='weighted')
ARI = adjusted_rand_score(adata1.obs['cell_type'], y_test)
NMI = normalized_mutual_info_score(adata1.obs['cell_type'], y_test)
print('F1:', f1)
print('ARI:', ARI)
print('NMI:', NMI)
print('total score:',f1+ARI+NMI)
print('Batch Entropy:', enclus.metrics.batch_entropy_mixing_score(adata.X, adata.obs['source']))
print('Silhouette:', enclus.metrics.silhouette(adata.X, adata.obs['cell_type']))
F1: 0.8003951568856976
ARI: 0.6026170137527536
NMI: 0.6698873201850263
total score: 2.0728994908234775
Batch Entropy: 0.15340537230811968
Silhouette: 0.49228478616875704